{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-01-13T13:42:59.153417Z",
     "start_time": "2025-01-13T13:42:58.332292Z"
    }
   },
   "source": [
    "from sklearn.feature_extraction import DictVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.decomposition import PCA\n",
    "import jieba\n",
    "import numpy as np\n",
    "from sklearn.impute import SimpleImputer\n",
    "#导入sklearn模块与包"
   ],
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "#### 特征处理，不同的特征拉到到同一个量纲",
   "id": "e84ebb2680625cf0"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-13T13:45:41.195798Z",
     "start_time": "2025-01-13T13:45:41.186327Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def mm():\n",
    "    \"\"\"\n",
    "    归一化处理             ！！！！容易收到异常值（非常大或者非常小的数）的影响\n",
    "    :return: NOne\n",
    "    \"\"\"\n",
    "    # 归一化缺点 容易受极值的影响\n",
    "    #feature_range代表特征值范围，一般设置为(0,1),或者(-1,1),默认是(0,1)，公式为（m,n)对应 f*（m-n)-n\n",
    "    mm = MinMaxScaler(feature_range=(0, 1))\n",
    "\n",
    "    data = mm.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])       #归一化后每一列最大值即为1，最小值即为0\n",
    "\n",
    "    print(data)\n",
    "    print('-' * 50)\n",
    "    out = mm.transform([[1, 2, 3, 4], [6, 5, 8, 7]])    #用训练集已经选定的最大最小值对测试集进行操作\n",
    "    print(out)\n",
    "    return None\n",
    "    #transform和fit_transform不同是，transform用于测试集，而且不会重新找最小值和最大值\n",
    "\n",
    "\n",
    "mm()"
   ],
   "id": "86586b0357e7c892",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1.         0.         0.         0.        ]\n",
      " [0.         1.         1.         0.83333333]\n",
      " [0.5        0.5        0.6        1.        ]]\n",
      "--------------------------------------------------\n",
      "[[-1.96666667  0.         -1.4        -6.        ]\n",
      " [-1.8         1.5        -0.4        -5.5       ]]\n"
     ]
    }
   ],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-13T13:50:33.000180Z",
     "start_time": "2025-01-13T13:50:32.991010Z"
    }
   },
   "cell_type": "code",
   "source": "(1 - 60) / 30         #out中（0，0）的-1.96666667来源，60为最大值，30为最大最小值的差值",
   "id": "1cc399b39ba7b621",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-1.9666666666666666"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-13T13:54:37.504843Z",
     "start_time": "2025-01-13T13:54:37.496181Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def stand():\n",
    "    \"\"\"\n",
    "    标准化缩放，不是标准正态分布，只均值为0，方差为1的分布       \n",
    "    ！！！！ 不易受异常值影响，但如果突然加入较多数量的新样本，结果偏差可能会变大\n",
    "    \"\"\"\n",
    "    std = StandardScaler()\n",
    "\n",
    "    data = std.fit_transform([[1., -1., 3.],\n",
    "                              [2., 4., 2.],\n",
    "                              [4., 6., -1.]])\n",
    "\n",
    "    print(data)\n",
    "    print('-' * 50)\n",
    "    print(std.mean_) #均值\n",
    "    print('-' * 50)\n",
    "    print(std.var_)  #方差\n",
    "    print(std.n_samples_seen_)  # 样本数\n",
    "    return data\n",
    "\n",
    "\n",
    "data = stand()"
   ],
   "id": "c429117cf5e8eb0f",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-1.06904497 -1.35873244  0.98058068]\n",
      " [-0.26726124  0.33968311  0.39223227]\n",
      " [ 1.33630621  1.01904933 -1.37281295]]\n",
      "--------------------------------------------------\n",
      "[2.33333333 3.         1.33333333]\n",
      "--------------------------------------------------\n",
      "[1.55555556 8.66666667 2.88888889]\n",
      "3\n"
     ]
    }
   ],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-13T13:55:46.327370Z",
     "start_time": "2025-01-13T13:55:46.321314Z"
    }
   },
   "cell_type": "code",
   "source": "type(data)    ",
   "id": "3ff9dc6a869c82b0",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "numpy.ndarray"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 5
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-13T13:56:22.134630Z",
     "start_time": "2025-01-13T13:56:22.129556Z"
    }
   },
   "cell_type": "code",
   "source": [
    "std1 = StandardScaler()\n",
    "#为了证明上面输出的结果的均值是为0的，方差为1，但并不是标准正态分布，而是由他数学公式推导出的性质\n",
    "data1 = std1.fit_transform([[-1.06904497, -1.35873244, 0.98058068],          #将data标准化后的数据复制过来，观察性质\n",
    "                            [-0.26726124, 0.33968311, 0.39223227],\n",
    "                            [1.33630621, 1.01904933, -1.37281295]])\n",
    "# print(data1)  #这个并不是我们想看的，没意义\n",
    "# 均值\n",
    "print(std1.mean_)\n",
    "# 方差\n",
    "print(std1.var_)"
   ],
   "id": "ea50dbb9a7c9ed3",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-1.06904497 -1.35873244  0.98058068]\n",
      " [-0.26726124  0.33968311  0.39223227]\n",
      " [ 1.33630621  1.01904933 -1.37281295]]\n",
      "[0. 0. 0.]\n",
      "[1.         1.         1.00000001]\n"
     ]
    }
   ],
   "execution_count": 6
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "",
   "id": "f26037c9e62467e0"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
